options(scipen = 999)
flights <- as.data.frame(nycflights13::flights)
wheather <- nycflights13::weather
airports <- nycflights13::airports
planes <- nycflights13::planes
Sys.setlocale("LC_ALL", "English")
## [1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"
The first graph shows us the percentage of flights that departed over a 15-minute delay divided into each destination individually. The purpose of this graph is to make the reader’s eye quickly find the percentage of flights based on a visualization of the United States map.
The second graph shows the weekly flight cycles throughout 2008 where the blue line indicates the total number of flights that took off that day and the red line indicates the number of flights in which there was a delay of over 15 minutes.
In the first graph the information is not presented in an ideal way, it is difficult for the reader to identify where each line is coming from, although it is possible to understand which countries it is but it is not always possible to identify exactly which airport it is.
In the second graph, in our opinion, the amount of lines and points confuses the reader and makes it difficult for him to identify the patterns of flight delays throughout the seasons. In addition, the information overload makes it difficult to identify recurring latency patterns on certain days for the length of the weekly / monthly cycle.
The first graph raises the obvious question: Are there countries where the average latency is higher than in other countries? In addition, which are the airports where the percentage of delays is higher? Another question is whether the percentages of delay measured vary according to the seasons?
The second graph raises the questions: Are there patterns of lateness that recur on certain days of the week or in a particular season, or are there lateness in some months than in other months? In addition, if the observation was lower by day / month, it would be possible to compare percentages and learn from the data by comparing proportions.
The first graph can be improved by changing the graph to a dynamic graph. This means that when the mouse points to a certain line, we will be presented with the values of the airport we are pointing to.
In the second graph it was possible to improve the visualization by changing the red bars to a long line connecting the dots which makes it possible to identify patterns in the delays of the flights.
A graphic summarizing the flight volume and flights delayed, broken by day and showing weekly cycles.
#EWR/ LGA/ JFK to choose
week_cycles <- as.data.frame(flights) %>% filter(origin == 'EWR') %>% dplyr::select(time_hour ,dep_delay ,sched_dep_time)
week_cycles$time_hour <- as.Date(week_cycles$time_hour,'ddmmmyyyy')
week_cycles<- week_cycles %>% mutate(delay = if_else(dep_delay <= 15 ,0,1,0))
week_cycles_freq <- week_cycles %>% group_by(time_hour) %>% summarise(frequency = n())
week_cycles_delay <- week_cycles %>% group_by(time_hour) %>% summarise(sum(delay))
l <- length(week_cycles_freq$frequency)
week_cycles_min <- week_cycles_freq[c(2:(l-1)),]
week_cycles_min <- week_cycles_min %>% mutate(local_min = (week_cycles_min$frequency == runmin(week_cycles_min$frequency,length(week_cycles_min$frequency)/1.3)))
week_cycles_min <- week_cycles_min %>% filter(local_min == T) %>% filter(frequency < 250)
p <- ggplot(data = week_cycles_freq,aes(x = time_hour,
y= week_cycles_freq$frequency,colour = "blue")) +
geom_point() + geom_line(aes(color = "blue")) +
geom_point(data = week_cycles_delay,aes(x=time_hour,y= `sum(delay)`, color ='red')) +
geom_linerange(data = week_cycles_delay,aes(x= time_hour, ymax =`sum(delay)` ,ymin=0),color = 'red') +
geom_point(data = week_cycles_min,aes(x=time_hour,y=week_cycles_min$frequency, color = "royalblue2"),size = 7)+
scale_color_identity(name ="" ,breaks = c("blue","red"),
labels = c("All Flights (scheculed for departure)",
"Late Flights (departure delayed >15)"),
guide = guide_legend(override.aes = list(linetype = c(1, 1), shape = c(16,16),size = c(1,1))))+
theme_light() +
xlab("")+ ylab("Flights \n per day") +
xlim(as.Date(c("2013-01-01","2014-01-01")))+ ylim(c(0, 400)) +
theme(axis.title.y = element_text(angle = 0, vjust = 1.1, hjust = 10)) +
scale_x_date(date_labels = "%d%b%Y", date_breaks = "1 month",expand = c(0,0)) +
theme(legend.position="top",legend.direction = "vertical") +
ggtitle(label = "Weekly Cycles", subtitle = "the airport = EWR Year = 2013") +
theme(plot.title = element_text(hjust = 0.5,size = 20, face = "bold"),
plot.subtitle = element_text(hjust = 0.5,size = 15, face = "bold"),
legend.text=element_text(size=12,face = "bold"))
d <- get_legend(p + scale_color_identity(name ="\n\n WEEKLY CYCLES \n FEW FLIGHTS: \n SUNDAYS AND SATURDAYS \n MOST FLIGHTS: \n MONDAYS, THURSDAYS \n AND FRIDAYS" ,
breaks = c("royalblue2"), labels = c("Fewer flights on holidays"),
guide = guide_legend(override.aes = list(linetype = c(0), shape = c(16),size = c(7)))) +
theme(legend.text=element_text(size=12, face = "bold"),
legend.title=element_text(size=12, face = "bold")))
grid.arrange(d, p, layout_matrix = matrix(c(1,2,2, 2, 2), ncol = 5),
top = textGrob(" TEMPORAL EFFECTS",
gp=gpar(fontsize=45, col = "royalblue2", fontface = 2)))
The other graph, flight percent
dep_delay <- as.data.frame(flights) %>% filter(origin == 'JFK') %>% group_by(dest) %>%
count()
colnames(dep_delay) <-c("faa", "amount")
dep_delay2 <- as.data.frame(flights) %>% filter(origin == 'JFK') %>% group_by(dest) %>% filter(dep_delay >15) %>% count()
colnames(dep_delay2) <- c("faa","delay")
dep_delay <- dep_delay %>% left_join(dep_delay2) %>% mutate(per = delay/ amount)
dep_delay_loc <- dep_delay %>% left_join(airports, by = "faa") %>% drop_na()
# removed : STT,SJU, BQN need to be explain why they can be found.
dep_delay_loc <- dep_delay_loc %>% mutate(per_ch = NA)
dep_delay_loc$per_ch[dep_delay_loc$per <= 0.10 ] <- "<= 10%"
dep_delay_loc$per_ch[dep_delay_loc$per <= 0.15 & dep_delay_loc$per > 0.10 ] <- "10% - 15%"
dep_delay_loc$per_ch[dep_delay_loc$per <= 0.20 & dep_delay_loc$per > 0.15 ] <- "15% - 20%"
dep_delay_loc$per_ch[dep_delay_loc$per <= 0.25 & dep_delay_loc$per > 0.20 ] <- "20% - 25%"
dep_delay_loc$per_ch[dep_delay_loc$per >= 0.25] <- " > 25%"
dep_delay_loc$per_ch <- factor(dep_delay_loc$per_ch, levels = c("<= 10%","10% - 15%",
"15% - 20%","20% - 25%"," > 25%"))
dep_delay_loc <- dep_delay_loc %>% mutate(nyc_EWR_lon = -74.16867) %>%
mutate(nyc_EWR_lat = 40.6925 )
dep_delay_loc <- as.data.frame(dep_delay_loc)
dep_delay_trans <- dep_delay_loc %>%
select(lon,lat) %>% usmap_transform()
dep_delay_loc$lon <- dep_delay_trans$lon.1
dep_delay_loc$lat <- dep_delay_trans$lat.1
dep_delay_trans <- dep_delay_loc %>% select(nyc_EWR_lon,nyc_EWR_lat) %>% usmap_transform()
dep_delay_loc$lnyc_EWR_lon <- dep_delay_trans$nyc_EWR_lon.1
dep_delay_loc$lnyc_EWR_lat <- dep_delay_trans$nyc_EWR_lat.1
row.names(dep_delay_loc) <- dep_delay_loc$name
plot_del <- plot_usmap(regions = "states",labels = TRUE,exclude = c("AK","HI"), size = 0.5,
label_color = "grey",
color = "grey") +
theme(panel.background=element_blank()) +
geom_segment(data = dep_delay_loc,
aes(xend = lon,yend = lat, x = lnyc_EWR_lon,
y = lnyc_EWR_lat ,colour = per_ch),size = 0.5) +
geom_point(data = dep_delay_loc, shape = 1,
aes(x = lon , y = lat, colour = per_ch,
text = row.names(dep_delay_loc)),
size = 1.25, show.legend = F) +
scale_color_manual(name = "",
values = c("green","purple","blue","orange","red"),
guide = guide_legend(override.aes = list(linetype = c(1, 1, 1,1,1),
size = c(2,2,2,2,2))))+
labs(title = '<b>% of Flight Departures Delayed > 15 Min</b><br> Airport = JFK Year = 2013<br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br>Click line endpoint to see that airports departures') +
theme(plot.title = element_text(hjust = 0.5,size = 12))
ggplotly(plot_del, tooltip = "text") %>% layout(legend = list(x = -0.1, y = -0.1))
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.
The first graph shows the relationship between visibility and flight delays.
flights_delayed <- filter(flights, dep_delay >15 ) #We want to see only the delayed flights
flights_delayed1 <- flights_delayed %>% group_by(time_hour) %>% summarise(Avg_dep_delay = mean(dep_delay))
Wheather_cond <- left_join(flights_delayed1, wheather) #combine the two relevant data frames
#aggregate the data so we can see the conection between visib to avg sep delay.
visib_df <- Wheather_cond %>% group_by(visib) %>% summarise(Avg_dep_delay = mean(Avg_dep_delay, na.rm = T), n = n())
visib_df <- visib_df %>% filter(n > 30)
ggplot(data = visib_df, mapping = aes(x = visib, y = Avg_dep_delay)) + geom_point(color = "violetred4") +
geom_smooth(method = "lm", fill = "violet", color = "violetred4") +
labs(x = "visibility", y="Average Departure Delay Time (minutes)", title = "Average Departure Delay vs. Quality of visibility")
When we tried to think what was the main reason for the delay of flights the first thing that came to our mind was the weather conditions. When we studied the weather database we noticed a variable called “visib” and decided to try to understand the relationship between visibility and flight delays. The graph shown above shows a direct relationship between the quality of visibility and the average flight delay in minutes. The better the visibility, the smaller the average delays.
#As before we summarise the average delay, only this time we compare it to the wind speed.
by_wind_speed <- Wheather_cond %>% group_by(wind_speed) %>% summarise(Avg_dep_delay = mean(Avg_dep_delay, na.rm = T), n = n())
#we filtered average values of less then 30 samples.
by_wind_speed <- filter(by_wind_speed, n>30)
ggplot(data = by_wind_speed, mapping = aes(x = wind_speed, y = Avg_dep_delay)) +
geom_point(color = "dodgerblue4") +
geom_smooth(method = "lm", fill = "deepskyblue", color = "dodgerblue4") +
labs(x = "Wind_speed (mph)", y="Average Departure Delay Time (minutes)",
title = "Average Departure Delay vs. Wind speed")
The second graph shows us another interesting relationship between the weather and the average of flight delays. In the graph above it we can see positive linear conection between the wind speed and the average flight delay in minutes. As the wind speed increases the delay time of the flight also increases.
In this question we were asked to check whether delayed-departure has a seasonal pattern using Graphical Lineup. Our null hypothesis is: - The proportion of delayed flights per month is independent across months.
#Creat a plot that shows the distribution of the delayed flights in each day of every month
ggplot(data = flights_delayed, aes(x = day)) +
geom_bar(aes(fill = month))+ facet_wrap(~month) +
labs(title = "Distribution of delayed flight in each month",
subtitle = "(Delayed flights are set to be flights that left the airport with over 15 minutes delay)", y = "Number of delayed flight", x = "Day") + scale_fill_viridis_c(guide = F)
In this question we were asked to produce a graph which answers the question: Is the proportion of delayed flights per month is independent across months. To this end, we have created a graph that shows for each month the distribution of flights that have been delayed for more than 15 minutes.
After thoroughly researching the graph findings we came to the conclusion that there is no relationship between the distribution of the delayed flights.
As a result of this conclusion we can say that the months are independent of each other and therefore we will reject the null hypothesis.
stop! Try to think which of the graphs you see below shows the true distribution of the number of flights that were delayed each month?
#First' we created a df that containes only the data that we need
by_month <- flights_delayed %>% group_by(month) %>% summarise(val = n())
by_month <- data.frame(by_month, semp = c(rep(1,12))) #named the real data as sample 1.
sampled_df <- by_month
row.names(sampled_df) <- by_month$month
#loop that sample new vecrots and tag the sample with number
#also each sample will merged to the data that containes the real values.
for (i in (2:9)){
x <- data.frame(month = c(1:12), semp = c(rep(i,12)),val = sample(by_month$val))
sampled_df <- full_join(sampled_df, x)
}
#plot all of the samples in the Line-up method
# we added colors that will help us identify the pattern of the values.
ggplot(data = sampled_df, mapping = aes(x = month, y = val, fill = val)) + facet_wrap(~semp) +
scale_x_continuous(breaks = c(1:12)) +
geom_bar(stat="identity") +
labs(title = "Simulated data-sets of delayed flight using line-up",
subtitle = "(Delayed flights are set to be flights that left the airport with over 15 minutes delay)",
y = "Number of delayed flight", x = "Month")+
theme(axis.text.x = element_text(size = 8)) +
scale_fill_distiller(palette = 'RdPu', direction =1)
In this question we were again asked to try to reject the null hypothesis but this time by creating graphs which are based on simulations of the data.
First, we checked the number of flights delayed each month. After that we created a vector that represent this values. The next step was to create simulation based on this values. we created a graph for every simulation and tried to see if there is any connection between all of this samples.
So what are you thinking ? Which of the graphs above is the one that shows the real data distribution? The real data is displayed in graph number ….. 1!
While we created the graph above we choose to mach colors for each bar that represent the numbers of delayed flights, small number will be represented with lighter colors and larger numbers with darker colors.
As we can see in sample 1 that represent the real values there is a certain trend between the months, the transition between the colors of the bars is an easy transition in most cases. There is an upward trend and there is also a downward trend in the real data values.
When we look at the other graphs that represent the simulations we fail to notice a similar trend in any of the graphs. As a result we can say that the relationship between the months is not a Random relationship and therefore the variance is not Random variance. The conclusion is that the months are not independent so we will reject the null hypothesis
Finally, we learned that we can learn about relationships between variables not necessarily with the help of formulas and numbers. Visual objects can also provide us with a great deal of statistical information.